Dataset Link: https://www.openml.org/search?type=data&sort=runs&id=31&status=active
I find that openml has a better description of the data. But the file is not in csv or excel format.
You can search online for how to convert the .arff file to a .csv file.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hvplot.pandas
# Display options: format floats to 2 decimal places, and show up to 50
# columns/rows before pandas truncates the output.
# Use the full option key 'display.float_format': the abbreviated
# 'display.float' only works through pandas' silent regex option matching
# and would break if another option name ever matched the pattern.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
# Load the German credit dataset (CSV converted from the OpenML .arff file).
data = pd.read_csv('../Data/UCI_German/german_credit.csv')
# The .arff conversion leaves stray single quotes in some column names;
# strip them so columns can be referenced cleanly.
data.columns = [col.replace("'", "") for col in data.columns]
# Peek at the first 10 rows.
data.head(10)
| checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | other_parties | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | '<0' | 6 | 'critical/other existing credit' | radio/tv | 1169 | 'no known savings' | '>=7' | 4 | 'male single' | none | 4 | 'real estate' | 67 | none | own | 2 | skilled | 1 | yes | yes | good |
| 1 | '0<=X<200' | 48 | 'existing paid' | radio/tv | 5951 | '<100' | '1<=X<4' | 2 | 'female div/dep/mar' | none | 2 | 'real estate' | 22 | none | own | 1 | skilled | 1 | none | yes | bad |
| 2 | 'no checking' | 12 | 'critical/other existing credit' | education | 2096 | '<100' | '4<=X<7' | 2 | 'male single' | none | 3 | 'real estate' | 49 | none | own | 1 | 'unskilled resident' | 2 | none | yes | good |
| 3 | '<0' | 42 | 'existing paid' | furniture/equipment | 7882 | '<100' | '4<=X<7' | 2 | 'male single' | guarantor | 4 | 'life insurance' | 45 | none | 'for free' | 1 | skilled | 2 | none | yes | good |
| 4 | '<0' | 24 | 'delayed previously' | 'new car' | 4870 | '<100' | '1<=X<4' | 3 | 'male single' | none | 4 | 'no known property' | 53 | none | 'for free' | 2 | skilled | 2 | none | yes | bad |
| 5 | 'no checking' | 36 | 'existing paid' | education | 9055 | 'no known savings' | '1<=X<4' | 2 | 'male single' | none | 4 | 'no known property' | 35 | none | 'for free' | 1 | 'unskilled resident' | 2 | yes | yes | good |
| 6 | 'no checking' | 24 | 'existing paid' | furniture/equipment | 2835 | '500<=X<1000' | '>=7' | 3 | 'male single' | none | 4 | 'life insurance' | 53 | none | own | 1 | skilled | 1 | none | yes | good |
| 7 | '0<=X<200' | 36 | 'existing paid' | 'used car' | 6948 | '<100' | '1<=X<4' | 2 | 'male single' | none | 2 | car | 35 | none | rent | 1 | 'high qualif/self emp/mgmt' | 1 | yes | yes | good |
| 8 | 'no checking' | 12 | 'existing paid' | radio/tv | 3059 | '>=1000' | '4<=X<7' | 2 | 'male div/sep' | none | 4 | 'real estate' | 61 | none | own | 1 | 'unskilled resident' | 1 | none | yes | good |
| 9 | '0<=X<200' | 30 | 'critical/other existing credit' | 'new car' | 5234 | '<100' | unemployed | 4 | 'male mar/wid' | none | 2 | car | 28 | none | own | 2 | 'high qualif/self emp/mgmt' | 1 | none | yes | bad |
# Summary statistics for the numeric columns (count/mean/std/quartiles).
data.describe()
| duration | credit_amount | installment_commitment | residence_since | age | existing_credits | num_dependents | |
|---|---|---|---|---|---|---|---|
| count | 1000.00 | 1000.00 | 1000.00 | 1000.00 | 1000.00 | 1000.00 | 1000.00 |
| mean | 20.90 | 3271.26 | 2.97 | 2.85 | 35.55 | 1.41 | 1.16 |
| std | 12.06 | 2822.74 | 1.12 | 1.10 | 11.38 | 0.58 | 0.36 |
| min | 4.00 | 250.00 | 1.00 | 1.00 | 19.00 | 1.00 | 1.00 |
| 25% | 12.00 | 1365.50 | 2.00 | 2.00 | 27.00 | 1.00 | 1.00 |
| 50% | 18.00 | 2319.50 | 3.00 | 3.00 | 33.00 | 1.00 | 1.00 |
| 75% | 24.00 | 3972.25 | 4.00 | 4.00 | 42.00 | 2.00 | 1.00 |
| max | 72.00 | 18424.00 | 4.00 | 4.00 | 75.00 | 4.00 | 2.00 |
# Target distribution: 700 good vs 300 bad (see output below) -- the
# classes are moderately imbalanced, motivating the undersampling later.
print(data["class"].value_counts())
data["class"].value_counts().hvplot(kind='bar', x='index', y='class', title='Class')
good 700 bad 300 Name: class, dtype: int64
# Correlation heatmap of the numeric features.
# numeric_only=True is required: on pandas >= 2.0 DataFrame.corr() raises a
# TypeError when object (string) columns such as checking_status are present
# (older pandas silently dropped them).
plt.figure(figsize=(8, 5))
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='viridis')
<AxesSubplot:>
# Histogram + KDE of loan duration, split by credit class.
feature = 'duration'
hist = data.hvplot.hist(y=feature, by='class', bins=20, alpha=0.5, height=250, width=500)
kde = data.hvplot.kde(y=feature, by='class', line_width=3, height=250, width=500)
result = hist + kde
# Stack the two plots vertically in one column.
result.cols(1)
# Same hist + KDE view for the loan amount.
feature = 'credit_amount'
hist = data.hvplot.hist(y=feature, by='class', bins=20, alpha=0.5, height=250, width=500)
kde = data.hvplot.kde(y=feature, by='class', line_width=3, height=250, width=500)
result = hist + kde
result.cols(1)
# Same hist + KDE view for applicant age.
feature = 'age'
hist = data.hvplot.hist(y=feature, by='class', bins=20, alpha=0.5, height=250, width=500)
kde = data.hvplot.kde(y=feature, by='class', line_width=3, height=250, width=500)
result = hist + kde
result.cols(1)
# Category counts + bar chart for the checking-account status.
print(data["checking_status"].value_counts())
data["checking_status"].value_counts().hvplot(kind='bar', x='index', y='checking_status', title='Checking Status')
'no checking' 394 '<0' 274 '0<=X<200' 269 '>=200' 63 Name: checking_status, dtype: int64
# Category counts + bar chart for credit history.
print(data['credit_history'].value_counts())
data['credit_history'].value_counts().hvplot(kind='bar', x='index', y='credit_history', title='Credit History')
'existing paid' 530 'critical/other existing credit' 293 'delayed previously' 88 'all paid' 49 'no credits/all paid' 40 Name: credit_history, dtype: int64
# Category counts + bar chart for the loan purpose.
print(data["purpose"].value_counts())
data["purpose"].value_counts().hvplot(kind='bar', x='index', y='purpose', title='Purpose')
radio/tv 280 'new car' 234 furniture/equipment 181 'used car' 103 business 97 education 50 repairs 22 'domestic appliance' 12 other 12 retraining 9 Name: purpose, dtype: int64
# Category counts + bar chart for savings-account status.
print(data["savings_status"].value_counts())
data["savings_status"].value_counts().hvplot(kind='bar', x='index', y='savings_status', title='Saving Status')
'<100' 603 'no known savings' 183 '100<=X<500' 103 '500<=X<1000' 63 '>=1000' 48 Name: savings_status, dtype: int64
# Category counts + bar chart for employment duration.
print(data["employment"].value_counts())
data["employment"].value_counts().hvplot(kind='bar', x='index', y='employment', title='Employment')
'1<=X<4' 339 '>=7' 253 '4<=X<7' 174 '<1' 172 unemployed 62 Name: employment, dtype: int64
# Category counts + bar chart for installment rate (1-4).
print(data["installment_commitment"].value_counts())
data["installment_commitment"].value_counts().hvplot(kind='bar', x='index', y='installment_commitment', title='Installment Commitment')
4 476 2 231 3 157 1 136 Name: installment_commitment, dtype: int64
# Category counts + bar chart for personal status (sex / marital status).
print(data["personal_status"].value_counts())
data["personal_status"].value_counts().hvplot(kind='bar', x='index', y='personal_status', title='Personal Status')
'male single' 548 'female div/dep/mar' 310 'male mar/wid' 92 'male div/sep' 50 Name: personal_status, dtype: int64
# Category counts + bar chart for other debtors / guarantors.
print(data["other_parties"].value_counts())
data["other_parties"].value_counts().hvplot(kind='bar', x='index', y='other_parties', title='Other Parties')
none 907 guarantor 52 'co applicant' 41 Name: other_parties, dtype: int64
# Category counts + bar chart for years at present residence (1-4).
print(data["residence_since"].value_counts())
data["residence_since"].value_counts().hvplot(kind='bar', x='index', y='residence_since', title='Residence Since')
4 413 2 308 3 149 1 130 Name: residence_since, dtype: int64
# Category counts + bar chart for most valuable property owned.
print(data["property_magnitude"].value_counts())
data["property_magnitude"].value_counts().hvplot(kind='bar', x='index', y='property_magnitude', title='Property Magnitude')
car 332 'real estate' 282 'life insurance' 232 'no known property' 154 Name: property_magnitude, dtype: int64
# Category counts + bar chart for other installment plans.
print(data["other_payment_plans"].value_counts())
data["other_payment_plans"].value_counts().hvplot(kind='bar', x='index', y='other_payment_plans', title='Other Payment Plans')
none 814 bank 139 stores 47 Name: other_payment_plans, dtype: int64
# Category counts + bar chart for housing situation.
print(data["housing"].value_counts())
data["housing"].value_counts().hvplot(kind='bar', x='index', y='housing', title='Housing')
own 713 rent 179 'for free' 108 Name: housing, dtype: int64
# Category counts + bar chart for number of existing credits at this bank.
print(data["existing_credits"].value_counts())
data["existing_credits"].value_counts().hvplot(kind='bar', x='index', y='existing_credits', title='Existing Credits')
1 633 2 333 3 28 4 6 Name: existing_credits, dtype: int64
# Category counts + bar chart for job / skill level.
print(data["job"].value_counts())
data["job"].value_counts().hvplot(kind='bar', x='index', y='job', title='Job')
skilled 630 'unskilled resident' 200 'high qualif/self emp/mgmt' 148 'unemp/unskilled non res' 22 Name: job, dtype: int64
# Category counts + bar chart for number of dependents (1 or 2).
print(data["num_dependents"].value_counts())
data["num_dependents"].value_counts().hvplot(kind='bar', x='index', y='num_dependents', title='Num Dependents')
1 845 2 155 Name: num_dependents, dtype: int64
# Category counts + bar chart for telephone ownership.
print(data["own_telephone"].value_counts())
data["own_telephone"].value_counts().hvplot(kind='bar', x='index', y='own_telephone', title='Telephone Owner')
none 596 yes 404 Name: own_telephone, dtype: int64
# Category counts + bar chart for foreign-worker status (heavily skewed: 963 yes / 37 no).
print(data["foreign_worker"].value_counts())
data["foreign_worker"].value_counts().hvplot(kind='bar', x='index', y='foreign_worker', title='Foreign Worker')
yes 963 no 37 Name: foreign_worker, dtype: int64
from sklearn.metrics import confusion_matrix, roc_auc_score
def get_experiment_result(y_true, y_pred_proba):
    """Print per-class recall and ROC AUC for binary default predictions.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (1 = default / "bad" credit).
    y_pred_proba : array-like of predicted probabilities for class 1.
    """
    # Threshold the probabilities at 0.5 (via np.round) to get hard labels.
    cm = confusion_matrix(y_true, np.round(y_pred_proba))
    TP = cm[1, 1]  # defaults correctly flagged
    FP = cm[0, 1]  # non-defaults incorrectly flagged as default
    FN = cm[1, 0]  # defaults missed
    TN = cm[0, 0]  # non-defaults correctly passed
    a = TP / (TP + FN)  # default recall (sensitivity)
    b = TN / (TN + FP)  # non-default recall (specificity)
    auc = roc_auc_score(y_true, y_pred_proba)
    # BUG FIX: a, b and auc are fractions in [0, 1]; multiply by 100 so the
    # "%" suffix in the format string reports a true percentage (the original
    # printed e.g. "0.720%" instead of "72.000%").
    print("Non-Default Recall: %.3f%%" % (b * 100))
    print("Default Recall: %.3f%%" % (a * 100))
    print("AUC: %.3f%%" % (auc * 100))
# Binary-encode the target: good -> 0, bad (default) -> 1.
data['class'] = data['class'].map({'good': 0, 'bad': 1})

# One-hot encode every categorical predictor. get_dummies uses each source
# column's own name as the dummy-column prefix by default, which is exactly
# the naming we want.
categorical_feature = [
    'checking_status', 'credit_history', 'purpose', 'savings_status',
    'employment', 'personal_status', 'other_parties', 'property_magnitude',
    'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker',
]
data = pd.get_dummies(data, columns=categorical_feature)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Assign each row to train/test once (80/20 expected split).
# BUG FIX: the original draw was unseeded, so the split -- and every metric
# reported below -- changed on each run; seed the generator for reproducibility.
if "Set" not in data.columns:
    rng = np.random.RandomState(42)
    data["Set"] = rng.choice(["train", "test"], p=[.8, .2], size=(data.shape[0],))

train_indices = data[data.Set == "train"].index
test_indices = data[data.Set == "test"].index
data = data.drop(columns=['Set'])

# Positional indexing with these index labels is safe here because the frame
# still carries its default RangeIndex (labels == positions).
y_train = data["class"].values[train_indices]
y_test = data["class"].values[test_indices]
data = data.drop("class", axis=1)
X_train = data.values[train_indices]
X_test = data.values[test_indices]

print("Training Instances: ", X_train.shape[0])
print("Test Instances: ", X_test.shape[0])
# Class counts in the training labels (index 0 = good, 1 = bad).
np.bincount(y_train)
Training Instances: 804 Test Instances: 196
array([568, 236])
from imblearn.under_sampling import RandomUnderSampler

# Balance the training classes by randomly dropping majority-class rows.
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Scale every feature to [0, 1]. The scaler is fitted on the training data
# only and then applied unchanged to the test data to avoid leakage.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Training Instances: ", X_train.shape[0])
print("Test Instances: ", X_test.shape[0])
# After undersampling the two classes are exactly balanced.
np.bincount(y_train)
Training Instances: 472 Test Instances: 196
array([236, 236])
from xgboost import XGBClassifier
# Create an XGBoost classifier object (default hyperparameters).
xgb_clf = XGBClassifier()
# Train the XGBoost classifier on the training data
xgb_clf.fit(X_train, y_train)
# Evaluate on the held-out test set with the predicted probability of the
# positive class (column 1 = "bad" credit / default).
y_pred_proba = xgb_clf.predict_proba(X_test)
get_experiment_result(y_test, y_pred_proba[:, 1])
Non-Default Recall: 0.720% Default Recall: 0.656% AUC: 0.756%
# NORMALIZE & CONVERT TO NUMPY ARRAYS
# NOTE(review): X_train/X_test were already min-max scaled above, so this
# re-fit maps the train features back onto the same [0, 1] range (an
# effectively redundant transform). This cell looks like a duplicate of the
# previous experiment -- confirm whether it was meant to run on different
# (e.g. non-undersampled) data before relying on its reported metrics.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from xgboost import XGBClassifier
# Create an XGBoost classifier object
xgb_clf = XGBClassifier()
# Train the XGBoost classifier on the training data
xgb_clf.fit(X_train, y_train)
y_pred_proba = xgb_clf.predict_proba(X_test)
get_experiment_result(y_test, y_pred_proba[:, 1])
Non-Default Recall: 0.802% Default Recall: 0.591% AUC: 0.772%